{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Tensorflow Word Embeddings From Scratch\n",
    "- Using CBOW model\n",
    "- References:\n",
    "    - https://gist.github.com/yxtay/a94d971955d901c4690129580a4eafb9\n",
    "    - https://github.com/huseinzol05/Text-Classification-Comparison/blob/master/preparation/word-vector.ipynb\n",
    "    - The difference with husein model is that he trains Skip-gram and I train a CBOW model. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import tensorflow as tf\n",
    "import pandas as pd\n",
    "import numpy as np\n",
    "import time\n",
    "from sklearn.preprocessing import LabelEncoder\n",
    "import re\n",
    "import collections\n",
    "import random\n",
    "import time\n",
    "import os\n",
    "import helpers.pickle_helpers as ph\n",
    "import math\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generating raw vocabulary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "train_data = ph.load_from_pickle(directory=\"data/husein_emotion/emotion-english/merged_training.pkl\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def clearstring(string):\n",
    "    string = re.sub('[^\\'\\\"A-Za-z0-9 ]+', '', string)\n",
    "    string = string.split(' ')\n",
    "    string = filter(None, string)\n",
    "    string = [y.strip() for y in string]\n",
    "    string = [y for y in string if len(y) > 3 and y.find('nbsp') < 0]\n",
    "    return ' '.join(string)\n",
    "\n",
    "def read_data():\n",
    "    e_dir = 'data/husein_emotion/emotion-english/data/'\n",
    "    list_folder = os.listdir(e_dir)\n",
    "    label = list_folder\n",
    "    label.sort()\n",
    "    outer_string, outer_label = [], []\n",
    "    for i in range(len(list_folder)):\n",
    "        list_file = os.listdir(e_dir + list_folder[i])\n",
    "        strings = []\n",
    "        for x in range(len(list_file)):\n",
    "            with open(e_dir + list_folder[i] + '/' + list_file[x], 'r') as fopen:\n",
    "                strings += fopen.read().split('\\n')\n",
    "        strings = list(filter(None, strings))\n",
    "        for k in range(len(strings)):\n",
    "            strings[k] = clearstring(strings[k])\n",
    "        labels = [i] * len(strings)\n",
    "        outer_string += strings\n",
    "        outer_label += labels\n",
    "    \n",
    "    dataset = np.array([outer_string, outer_label])\n",
    "    dataset = dataset.T\n",
    "    np.random.shuffle(dataset)\n",
    "    \n",
    "    string = []\n",
    "    for i in range(dataset.shape[0]):\n",
    "        string += dataset[i][0].split()\n",
    "    \n",
    "    return string, dataset, label\n",
    "\n",
    "def read_data_with_pandas():\n",
    "    \"\"\" I already converted the data into pandas to we can avoid the function above\"\"\"\n",
    "    vocab = []\n",
    "    text = train_data.text.values.tolist()\n",
    "    for t in text:\n",
    "        strings = clearstring(t)\n",
    "        vocab+=strings.split()\n",
    "    return vocab"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "#vocabulary, dataset, label = read_data()\n",
    "vocabulary = read_data_with_pandas()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "example 10 words: ['feel', 'awful', 'about', 'because', 'position', 'succeed', 'just', 'didn', 'happen', 'here']\n",
      "size corpus: 4433712\n",
      "size of unique words: 71554\n"
     ]
    }
   ],
   "source": [
    "print(\"example 10 words:\", vocabulary[:10])\n",
    "print('size corpus:',len(vocabulary))\n",
    "vocabulary_size = len(list(set(vocabulary)))\n",
    "print('size of unique words:',vocabulary_size)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Build Dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def build_dataset(words, n_words):\n",
    "    count = [['UNK', -1]]\n",
    "    count.extend(collections.Counter(words).most_common(n_words - 1))\n",
    "    dictionary = dict()\n",
    "    for word, _ in count:\n",
    "        dictionary[word] = len(dictionary) # increase index as words added\n",
    "    data = list()\n",
    "    unk_count = 0\n",
    "    for word in words:\n",
    "        if word in dictionary:\n",
    "            index = dictionary[word]\n",
    "        else:\n",
    "            index = 0  # dictionary['UNK']\n",
    "            unk_count += 1\n",
    "        data.append(index)\n",
    "    count[0][1] = unk_count\n",
    "    reverse_dictionary = dict(zip(dictionary.values(), dictionary.keys()))\n",
    "    return data, count, dictionary, reverse_dictionary"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "data, count, dictionary, reverse_dictionary = build_dataset(vocabulary, vocabulary_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "del vocabulary # reduces memory"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most common words (+UNK) [['UNK', 1], ('feel', 289939), ('feeling', 134185), ('that', 130733), ('like', 73972)]\n",
      "Sample data [1, 370, 8, 11, 1016, 2713, 9, 176, 434, 72, 140, 1, 370, 228, 1193] ['feel', 'awful', 'about', 'because', 'position', 'succeed', 'just', 'didn', 'happen', 'here', 'alone', 'feel', 'awful', 'probably', 'mentioned']\n"
     ]
    }
   ],
   "source": [
    "print('Most common words (+UNK)', count[:5])\n",
    "print('Sample data', data[:15], [reverse_dictionary[i] for i in data[:15]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4433712"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Generating Batches for the CBOW model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "data_index = 0\n",
    "\n",
    "def generate_batch(batch_size, context_window):\n",
    "    # all context tokens should be used, hence no associated num_skips argument\n",
    "    global data_index\n",
    "    context_size = 2 * context_window\n",
    "    batch = np.ndarray(shape=(batch_size, context_size), dtype=np.int32)\n",
    "    labels = np.ndarray(shape=(batch_size, 1), dtype=np.int32)\n",
    "    span = 2 * context_window + 1  # [ context_window target context_window ]\n",
    "    buffer = collections.deque(maxlen=span)\n",
    "    for _ in range(span):\n",
    "        buffer.append(data[data_index])\n",
    "        data_index = (data_index + 1) % len(data)\n",
    "    for i in range(batch_size):\n",
    "        # context tokens are just all the tokens in buffer except the target\n",
    "        batch[i, :] = [token for idx, token in enumerate(buffer) if idx != context_window]\n",
    "        labels[i, 0] = buffer[context_window]\n",
    "        buffer.append(data[data_index])\n",
    "        data_index = (data_index + 1) % len(data)\n",
    "    data_index-=1\n",
    "    return batch, labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch, labels = generate_batch(batch_size=8, context_window=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1 feel 8 about -> 370 awful\n",
      "370 awful 11 because -> 8 about\n",
      "8 about 1016 position -> 11 because\n",
      "11 because 2713 succeed -> 1016 position\n",
      "1016 position 9 just -> 2713 succeed\n",
      "2713 succeed 176 didn -> 9 just\n",
      "9 just 434 happen -> 176 didn\n",
      "176 didn 72 here -> 434 happen\n"
     ]
    }
   ],
   "source": [
    "for i in range(8):\n",
    "    print(batch[i, 0], reverse_dictionary[batch[i, 0]],\n",
    "          batch[i, 1], reverse_dictionary[batch[i, 1]],\n",
    "          '->', labels[i, 0], reverse_dictionary[labels[i, 0]])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[1,\n",
       " 370,\n",
       " 8,\n",
       " 11,\n",
       " 1016,\n",
       " 2713,\n",
       " 9,\n",
       " 176,\n",
       " 434,\n",
       " 72,\n",
       " 140,\n",
       " 1,\n",
       " 370,\n",
       " 228,\n",
       " 1193,\n",
       " 6,\n",
       " 99,\n",
       " 13,\n",
       " 1,\n",
       " 297,\n",
       " 17,\n",
       " 105,\n",
       " 967,\n",
       " 7,\n",
       " 144,\n",
       " 4320,\n",
       " 4800,\n",
       " 3190,\n",
       " 1340,\n",
       " 2]"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "list(data[:30])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "(array([[   1,    8],\n",
       "        [ 370,   11],\n",
       "        [   8, 1016],\n",
       "        [  11, 2713],\n",
       "        [1016,    9],\n",
       "        [2713,  176],\n",
       "        [   9,  434],\n",
       "        [ 176,   72]], dtype=int32), array([[ 370],\n",
       "        [   8],\n",
       "        [  11],\n",
       "        [1016],\n",
       "        [2713],\n",
       "        [   9],\n",
       "        [ 176],\n",
       "        [ 434]], dtype=int32), None)"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "batch, labels, print(data_index)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 128\n",
    "embedding_size = 128  # Dimension of the embedding vector.\n",
    "context_window = 1  # How many words to consider left and right.\n",
    "context_size = 2 * context_window\n",
    "\n",
    "valid_size = 16\n",
    "valid_window = 100\n",
    "valid_examples = np.random.choice(valid_window, valid_size, replace=False)\n",
    "num_sampled = 64 \n",
    "\n",
    "data_index = 0 # reset index"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([33, 42, 61, 31, 11, 27, 98, 50, 48, 83, 76, 26,  0,  2,  7, 38])"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "valid_examples"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [],
   "source": [
    "graph = tf.Graph()\n",
    "\n",
    "with graph.as_default():\n",
    "    # Input data.\n",
    "    train_inputs = tf.placeholder(tf.int32, shape=[batch_size, context_size])\n",
    "    train_labels = tf.placeholder(tf.int32, shape=[batch_size, 1])\n",
    "    valid_dataset = tf.constant(valid_examples, dtype=tf.int32)\n",
    "    \n",
    "    with tf.device('/cpu:0'):\n",
    "        # Look up embeddings for inputs.\n",
    "        embeddings = tf.Variable(\n",
    "            tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))\n",
    "        embed = tf.nn.embedding_lookup(embeddings, train_inputs)\n",
    "        # take mean of embeddings of context words for context embedding\n",
    "        embed_context = tf.reduce_mean(embed, 1)\n",
    "\n",
    "        # Construct the variables for the NCE loss\n",
    "        nce_weights = tf.Variable(\n",
    "            tf.truncated_normal([vocabulary_size, embedding_size],\n",
    "                                stddev=1.0 / np.sqrt(embedding_size)))\n",
    "        nce_biases = tf.Variable(tf.zeros([vocabulary_size]))\n",
    "    \n",
    "    loss = tf.reduce_mean(\n",
    "        tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed_context,\n",
    "                       num_sampled, vocabulary_size))\n",
    "\n",
    "    # Construct the SGD optimizer using a learning rate of 1.0.\n",
    "    optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(loss)\n",
    "    \n",
    "    # Compute the cosine similarity between minibatch examples and all embeddings.\n",
    "    norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))\n",
    "    normalized_embeddings = embeddings / norm\n",
    "    valid_embeddings = tf.nn.embedding_lookup(\n",
    "        normalized_embeddings, valid_dataset)\n",
    "    similarity = tf.matmul(\n",
    "        valid_embeddings, normalized_embeddings, transpose_b=True)\n",
    "\n",
    "    # Add variable initializer.\n",
    "    init = tf.global_variables_initializer()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Training"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Initialized\n",
      "Average loss at step  0 :  299.08343505859375\n",
      "Nearest to much: chel, minion, rian, niceee, harmonizes, thou, gillan, interrupts,\n",
      "Nearest to could: ceviche, reseach, lifestyles, references, rasputins, latimers, befuddles, sealer,\n",
      "Nearest to work: munich, intake, sweetie, gwens, tolong, synch, agitated, into,\n",
      "Nearest to some: choice, hashtag, shropshire, atoshealthcare, refferred, unimportance, succeeded, sabar,\n",
      "Nearest to because: noche, afte, wwii, boarder, timeisaidsomething, metronomy, maniacofmagic, miiiiight,\n",
      "Nearest to want: hoie, prostration, carseat, nagalene, vinage, harrumping, vistas, frightened,\n",
      "Nearest to person: possible, quetions, christasthostoraoraret, underprivledged, memories, splayed, ashers, dreamcatcher,\n",
      "Nearest to their: beind, thumped, commodore, daven, libers, jiggle, yowling, meuan,\n",
      "Nearest to always: lovechild, yoma, qualitative, battlefield, cheesy, palet, seris, emitting,\n",
      "Nearest to through: raynaud, baskets, grasps, subcontracted, ablowing, lilli, roadway, sethe,\n",
      "Nearest to those: clarion, mally, unschooling, filmic, pathological, terrib, shavuot, negotiating,\n",
      "Nearest to would: poetics, merrier, flitwick, truthiness, shovels, jessieh, crabs, cheers,\n",
      "Nearest to UNK: thomas, grandma, panadols, sidetracks, illuminati, johannson, framing, abilities,\n",
      "Nearest to feeling: czech, prabhu, reliant, mcewan, ghostte, dummer, unscrupulousness, affirm,\n",
      "Nearest to with: jacy, fuckity, recuperate, masha, tranquilizes, attaching, treyarch, purposeful,\n",
      "Nearest to make: palestinians, intensifies, counter, enthralled, salwar, medicinals, culpability, mach,\n",
      "Average loss at step  2000 :  125.22336650371551\n",
      "Average loss at step  4000 :  63.271121943473815\n",
      "Average loss at step  6000 :  42.7147980966568\n",
      "Average loss at step  8000 :  31.526810986042022\n",
      "Average loss at step  10000 :  23.63127056312561\n",
      "Nearest to much: lived, thrilled, while, prepared, breath, expect, wants, fact,\n",
      "Nearest to could: security, loving, would, help, shes, energetic, ugly, become,\n",
      "Nearest to work: comfortable, agitated, wrong, into, intake, imagine, generous, wife,\n",
      "Nearest to some: choice, both, songs, local, these, attention, honest, young,\n",
      "Nearest to because: that, wear, think, just, when, following, mark, really,\n",
      "Nearest to want: frightened, cant, know, driving, ours, focus, forget, going,\n",
      "Nearest to person: possible, memories, sign, jaded, simple, upon, dead, environment,\n",
      "Nearest to their: seen, please, self, damaged, while, bloody, directly, photography,\n",
      "Nearest to always: give, birthday, process, wouldnt, stand, still, return, should,\n",
      "Nearest to through: father, word, over, interested, caring, satisfied, preparing, title,\n",
      "Nearest to those: piece, answer, thank, tomatoes, worse, christ, these, people,\n",
      "Nearest to would: habits, could, suffering, grateful, will, event, window, road,\n",
      "Nearest to UNK: thomas, framing, grandma, unloved, ridden, panadols, abilities, sidetracks,\n",
      "Nearest to feeling: feel, thinking, although, feels, constant, become, mixed, shared,\n",
      "Nearest to with: computer, exercise, abit, missed, welcome, rearing, determined, into,\n",
      "Nearest to make: enthralled, irritated, counter, look, popular, take, color, apprehensive,\n",
      "Average loss at step  12000 :  18.970122277021407\n",
      "Average loss at step  14000 :  15.580851222515106\n",
      "Average loss at step  16000 :  12.93910071492195\n",
      "Average loss at step  18000 :  11.18920814061165\n",
      "Average loss at step  20000 :  10.008783981800079\n",
      "Nearest to much: thou, prepared, positive, lived, expect, budget, album, thrilled,\n",
      "Nearest to could: would, security, loving, washed, cant, kindness, reputation, shes,\n",
      "Nearest to work: intake, safely, solemn, increasingly, crimes, wrong, imagine, generous,\n",
      "Nearest to some: choice, these, local, both, songs, surprise, lost, dressed,\n",
      "Nearest to because: that, think, wear, when, following, video, mark, secure,\n",
      "Nearest to want: need, cant, know, recovering, ours, frightened, forget, route,\n",
      "Nearest to person: possible, memories, round, sign, flaws, presentation, polite, annoyed,\n",
      "Nearest to their: your, self, directly, please, sentence, photography, damaged, bloody,\n",
      "Nearest to always: give, still, wouldnt, birthday, stand, might, process, christian,\n",
      "Nearest to through: after, over, father, ultimately, baskets, into, remarks, event,\n",
      "Nearest to those: these, piece, tomatoes, answer, christ, thank, worse, lower,\n",
      "Nearest to would: could, will, should, habits, grateful, event, need, window,\n",
      "Nearest to UNK: thomas, framing, grandma, unloved, ridden, illuminati, panadols, reunited,\n",
      "Nearest to feeling: feel, feels, thinking, colored, mixed, span, become, theater,\n",
      "Nearest to with: computer, daddy, exercise, abit, welcome, into, determined, formula,\n",
      "Nearest to make: enthralled, counter, take, envy, help, palestinians, interview, popular,\n",
      "Average loss at step  22000 :  8.972736934423446\n",
      "Average loss at step  24000 :  8.290186167001725\n",
      "Average loss at step  26000 :  7.402682531118393\n",
      "Average loss at step  28000 :  6.9990194275379185\n",
      "Average loss at step  30000 :  6.572344171524048\n",
      "Nearest to much: prepared, thou, positive, budget, girlfriends, confident, fact, central,\n",
      "Nearest to could: would, security, cant, washed, wanted, will, reputation, loving,\n",
      "Nearest to work: intake, solemn, safely, winner, insanely, comfortable, esque, increasingly,\n",
      "Nearest to some: choice, these, local, both, surprise, pace, many, same,\n",
      "Nearest to because: that, wear, think, secure, video, mark, lonely, meeting,\n",
      "Nearest to want: need, cant, wanted, recovering, know, trying, itchy, ours,\n",
      "Nearest to person: possible, memories, flaws, polite, round, sign, bath, district,\n",
      "Nearest to their: your, photography, directly, self, sentence, damaged, love, decorations,\n",
      "Nearest to always: still, give, wouldnt, birthday, might, process, expecting, friendship,\n",
      "Nearest to through: into, after, over, ultimately, remarks, father, baskets, seek,\n",
      "Nearest to those: these, tomatoes, piece, people, drawer, downs, christ, thank,\n",
      "Nearest to would: will, could, should, habits, grateful, need, finds, truck,\n",
      "Nearest to UNK: thomas, framing, grandma, unloved, ridden, illuminati, rebellion, panadols,\n",
      "Nearest to feeling: feel, feels, colored, thinking, william, span, mixed, theater,\n",
      "Nearest to with: computer, heroine, daddy, welcome, exercise, formula, abit, bump,\n",
      "Nearest to make: envy, enthralled, take, help, counter, protective, breads, tell,\n",
      "Average loss at step  32000 :  6.378906316995621\n",
      "Average loss at step  34000 :  6.23798601782322\n",
      "Average loss at step  36000 :  5.7834570453166965\n",
      "Average loss at step  38000 :  5.594382426977157\n",
      "Average loss at step  40000 :  5.46941244840622\n",
      "Nearest to much: thou, prepared, positive, effectively, budget, central, finding, girlfriends,\n",
      "Nearest to could: would, cant, security, wanted, does, cannot, couldnt, washed,\n",
      "Nearest to work: intake, safely, solemn, winner, insanely, sweetie, alcohol, esque,\n",
      "Nearest to some: choice, these, local, both, sony, many, same, surprise,\n",
      "Nearest to because: wear, that, think, lonely, secure, shaking, which, video,\n",
      "Nearest to want: need, wanted, recovering, cant, itchy, trying, audience, know,\n",
      "Nearest to person: possible, memories, polite, round, flaws, district, bath, sign,\n",
      "Nearest to their: your, photography, self, directly, damaged, decent, nowadays, decorations,\n",
      "Nearest to always: still, wouldnt, give, never, might, display, expecting, birthday,\n",
      "Nearest to through: into, remarks, ultimately, over, baskets, after, seek, bike,\n",
      "Nearest to those: these, tomatoes, many, other, downs, christ, drawer, piece,\n",
      "Nearest to would: could, will, should, bridge, neighbours, finds, truck, need,\n",
      "Nearest to UNK: thomas, framing, unloved, grandma, ridden, illuminati, lamented, rebellion,\n",
      "Nearest to feeling: feel, colored, feels, thinking, william, theater, assume, knowing,\n",
      "Nearest to with: computer, heroine, abit, welcome, daddy, insurance, bump, videos,\n",
      "Nearest to make: envy, help, counter, take, breads, enthralled, makes, palestinians,\n",
      "Average loss at step  42000 :  5.395769045352936\n",
      "Average loss at step  44000 :  5.243148478984833\n",
      "Average loss at step  46000 :  5.1700991148948665\n",
      "Average loss at step  48000 :  5.117234437704086\n",
      "Average loss at step  50000 :  5.085444109678268\n",
      "Nearest to much: thou, prepared, effectively, positive, budget, central, finding, fact,\n",
      "Nearest to could: would, cant, cannot, security, wanted, couldnt, does, doesnt,\n",
      "Nearest to work: intake, safely, insanely, sweetie, winner, solemn, working, alcohol,\n",
      "Nearest to some: these, choice, many, sony, certain, both, local, same,\n",
      "Nearest to because: that, think, wear, which, secure, lonely, mildly, mark,\n",
      "Nearest to want: need, wanted, recovering, trying, itchy, cant, audience, know,\n",
      "Nearest to person: possible, district, polite, flaws, round, memories, blissful, thing,\n",
      "Nearest to their: your, photography, directly, buffalo, self, decorations, nowadays, cheeks,\n",
      "Nearest to always: still, wouldnt, never, give, friendship, display, flaunt, expecting,\n",
      "Nearest to through: into, baskets, remarks, over, ultimately, after, seek, staring,\n",
      "Nearest to those: these, many, tomatoes, downs, other, christ, magnitude, people,\n",
      "Nearest to would: will, could, should, might, neighbours, bridge, compromise, truck,\n",
      "Nearest to UNK: thomas, framing, unloved, grandma, ridden, illuminati, lamented, abilities,\n",
      "Nearest to feeling: feel, colored, william, sans, feels, knowing, thinking, disappointing,\n",
      "Nearest to with: computer, heroine, daddy, abit, welcome, insurance, bump, wonderful,\n",
      "Nearest to make: makes, making, envy, help, made, counter, take, tell,\n",
      "Average loss at step  52000 :  5.045123442411422\n",
      "Average loss at step  54000 :  5.023891132354736\n",
      "Average loss at step  56000 :  5.02401539504528\n",
      "Average loss at step  58000 :  4.915226219415665\n",
      "Average loss at step  60000 :  4.950072752714157\n",
      "Nearest to much: thou, prepared, effectively, positive, central, budget, gillan, girlfriends,\n",
      "Nearest to could: would, cannot, cant, couldnt, wanted, does, security, doesnt,\n",
      "Nearest to work: safely, intake, working, sweetie, insanely, winner, solemn, fulfilled,\n",
      "Nearest to some: these, choice, certain, many, sony, pace, mule, both,\n",
      "Nearest to because: wear, that, secure, lonely, think, which, meeting, nonetheless,\n",
      "Nearest to want: need, wanted, recovering, trying, itchy, audience, cant, causes,\n",
      "Nearest to person: district, thing, memories, blissful, flaws, polite, possible, round,\n",
      "Nearest to their: your, photography, buffalo, lack, self, directly, cheeks, decorations,\n",
      "Nearest to always: still, wouldnt, never, flaunt, display, give, friendship, literally,\n",
      "Nearest to through: into, baskets, remarks, fantastical, ultimately, snoring, over, lecture,\n",
      "Nearest to those: these, many, tomatoes, downs, other, magnitude, yang, piece,\n",
      "Nearest to would: will, could, should, might, neighbours, compromise, bridge, truck,\n",
      "Nearest to UNK: thomas, framing, unloved, grandma, ridden, illuminati, lamented, abilities,\n",
      "Nearest to feeling: feel, william, knowing, feels, colored, assume, sans, thinking,\n",
      "Nearest to with: computer, heroine, daddy, bump, welcome, loneliness, insurance, abit,\n",
      "Nearest to make: making, makes, made, envy, help, take, breads, palestinians,\n",
      "Average loss at step  62000 :  4.868242716789245\n",
      "Average loss at step  64000 :  4.878298313379288\n",
      "Average loss at step  66000 :  4.853499369859695\n",
      "Average loss at step  68000 :  4.840205312728882\n",
      "Average loss at step  70000 :  4.727915882587433\n",
      "Nearest to much: thou, prepared, effectively, gillan, central, announcing, positive, budget,\n",
      "Nearest to could: would, cannot, couldnt, cant, wanted, does, security, might,\n",
      "Nearest to work: working, sweetie, intake, safely, winner, insanely, technically, surge,\n",
      "Nearest to some: these, choice, certain, sony, many, local, mule, same,\n",
      "Nearest to because: that, wear, generating, secure, mildly, lonely, since, which,\n",
      "Nearest to want: need, wanted, recovering, itchy, wanna, audience, trying, cant,\n",
      "Nearest to person: thing, district, blissful, flaws, polite, possible, memories, excesses,\n",
      "Nearest to their: your, photography, buffalo, cheeks, lack, directly, decorations, sweden,\n",
      "Nearest to always: still, never, wouldnt, literally, flaunt, probably, give, fiercely,\n",
      "Nearest to through: into, remarks, baskets, fantastical, wrenching, lecture, snoring, staring,\n",
      "Nearest to those: these, many, other, tomatoes, downs, yang, magnitude, christ,\n",
      "Nearest to would: will, could, should, might, neighbours, bridge, doesnt, finds,\n",
      "Nearest to UNK: thomas, framing, unloved, grandma, ridden, illuminati, lamented, abilities,\n",
      "Nearest to feeling: feel, william, colored, knowing, thinking, feels, assume, thigh,\n",
      "Nearest to with: heroine, computer, bump, insurance, daddy, abit, loneliness, confidently,\n",
      "Nearest to make: making, made, makes, help, envy, breads, palestinians, protective,\n",
      "Average loss at step  72000 :  4.678675398468971\n",
      "Average loss at step  74000 :  4.672134338021278\n",
      "Average loss at step  76000 :  4.6569824306964875\n",
      "Average loss at step  78000 :  4.644964736104011\n",
      "Average loss at step  80000 :  4.640530920028686\n",
      "Nearest to much: thou, prepared, effectively, gillan, announcing, central, budget, eighteen,\n",
      "Nearest to could: would, cannot, couldnt, cant, wanted, does, might, security,\n",
      "Nearest to work: working, intake, sweetie, insanely, safely, fulfilled, winner, alcohol,\n",
      "Nearest to some: these, choice, many, certain, sony, mule, both, lakes,\n",
      "Nearest to because: think, mildly, since, which, generating, wear, lonely, that,\n",
      "Nearest to want: need, wanted, recovering, wanna, itchy, trying, audience, causes,\n",
      "Nearest to person: thing, district, blissful, flaws, polite, possible, disappeared, score,\n",
      "Nearest to their: your, photography, buffalo, lack, cheeks, directly, decorations, hollywood,\n",
      "Nearest to always: still, never, wouldnt, only, probably, literally, flaunt, fiercely,\n",
      "Nearest to through: into, fantastical, lecture, baskets, wrenching, staring, snoring, remarks,\n",
      "Nearest to those: these, many, other, tomatoes, downs, magnitude, yang, christ,\n",
      "Nearest to would: will, could, might, should, neighbours, compromise, seemed, shouldnt,\n",
      "Nearest to UNK: thomas, framing, unloved, grandma, ridden, illuminati, lamented, abilities,\n",
      "Nearest to feeling: feel, william, knowing, colored, correction, thinking, para, assume,\n",
      "Nearest to with: heroine, computer, insurance, daddy, bump, loneliness, featured, abit,\n",
      "Nearest to make: making, made, makes, breads, envy, help, palestinians, take,\n",
      "Average loss at step  82000 :  4.6239968521595\n",
      "Average loss at step  84000 :  4.617230604171753\n",
      "Average loss at step  86000 :  4.607437689781189\n",
      "Average loss at step  88000 :  4.616980379104614\n",
      "Average loss at step  90000 :  4.6137591066360475\n",
      "Nearest to much: thou, prepared, effectively, announcing, gillan, central, appealing, budget,\n",
      "Nearest to could: would, cannot, couldnt, cant, wanted, does, might, doesnt,\n",
      "Nearest to work: working, sweetie, safely, intake, fulfilled, insanely, surge, struggle,\n",
      "Nearest to some: these, choice, certain, sony, many, mule, lakes, egon,\n",
      "Nearest to because: that, since, secure, nonetheless, think, generating, lonely, meeting,\n",
      "Nearest to want: wanted, need, wanna, recovering, itchy, trying, audience, please,\n",
      "Nearest to person: thing, district, blissful, flaws, excesses, polite, disappeared, score,\n",
      "Nearest to their: your, photography, buffalo, cheeks, lack, mused, directly, severity,\n",
      "Nearest to always: still, wouldnt, never, literally, shah, probably, fiercely, display,\n",
      "Nearest to through: into, fantastical, baskets, wrenching, staring, snoring, lecture, remarks,\n",
      "Nearest to those: these, many, tomatoes, other, magnitude, downs, yang, christ,\n",
      "Nearest to would: will, could, might, should, neighbours, seemed, shouldnt, compromise,\n",
      "Nearest to UNK: thomas, framing, unloved, grandma, illuminati, ridden, lamented, abilities,\n",
      "Nearest to feeling: feel, william, knowing, disappointing, correction, colored, para, orgasm,\n",
      "Nearest to with: heroine, computer, loneliness, insurance, sweetness, daddy, bump, featured,\n",
      "Nearest to make: making, made, makes, breads, help, envy, palestinians, weirdest,\n",
      "Average loss at step  92000 :  4.588345262765884\n",
      "Average loss at step  94000 :  4.578663722872734\n",
      "Average loss at step  96000 :  4.56554067504406\n",
      "Average loss at step  98000 :  4.569180638074875\n",
      "Average loss at step  100000 :  4.563093585252762\n",
      "Nearest to much: thou, prepared, announcing, effectively, gillan, central, appealing, fixed,\n",
      "Nearest to could: would, cannot, couldnt, cant, wanted, does, might, should,\n",
      "Nearest to work: working, sweetie, safely, intake, insanely, surge, winner, fulfilled,\n",
      "Nearest to some: these, choice, certain, mule, sony, many, lakes, egon,\n",
      "Nearest to because: since, that, generating, which, mildly, nonetheless, dengan, stroke,\n",
      "Nearest to want: wanted, need, wanna, recovering, itchy, audience, trying, cant,\n",
      "Nearest to person: thing, district, blissful, flaws, excesses, polite, disappeared, score,\n",
      "Nearest to their: your, photography, buffalo, cheeks, mused, severity, lack, document,\n",
      "Nearest to always: still, never, wouldnt, shah, fiercely, longer, literally, flaunt,\n",
      "Nearest to through: into, fantastical, wrenching, snoring, baskets, staring, lecture, supreme,\n",
      "Nearest to those: these, many, tomatoes, other, downs, yang, magnitude, number,\n",
      "Nearest to would: will, could, might, should, neighbours, seemed, doesnt, compromise,\n",
      "Nearest to UNK: thomas, framing, grandma, illuminati, unloved, ridden, lamented, abilities,\n",
      "Nearest to feeling: feel, william, assume, knowing, thigh, feels, disappointing, para,\n",
      "Nearest to with: heroine, insurance, computer, bump, sweetness, loneliness, daddy, confidently,\n",
      "Nearest to make: making, made, makes, envy, breads, help, weirdest, palestinians,\n"
     ]
    }
   ],
   "source": [
    "# Training schedule: 100001 steps so that step 100000 itself lands on a\n",
    "# reporting boundary and the last loss average is printed.\n",
    "num_steps = 100001\n",
    "# Number of batches folded into each printed loss figure.\n",
    "loss_report_interval = 2000\n",
    "\n",
    "with tf.Session(graph=graph) as session:\n",
    "    # We must initialize all variables before we use them.\n",
    "    init.run()\n",
    "    print(\"Initialized\")\n",
    "\n",
    "    average_loss = 0\n",
    "    for step in range(num_steps):\n",
    "        batch_inputs, batch_labels = generate_batch(\n",
    "            batch_size, context_window)\n",
    "        feed_dict = {train_inputs: batch_inputs, train_labels: batch_labels}\n",
    "\n",
    "        # Run one optimizer update and fetch the loss of this batch with it.\n",
    "        _, loss_val = session.run([optimizer, loss], feed_dict=feed_dict)\n",
    "        average_loss += loss_val\n",
    "\n",
    "        if step % loss_report_interval == 0:\n",
    "            # At step 0 the accumulator holds a single batch loss, so it is\n",
    "            # printed undivided; afterwards it is the mean over the interval.\n",
    "            if step > 0:\n",
    "                average_loss /= loss_report_interval\n",
    "            print(\"Average loss at step \", step, \": \", average_loss)\n",
    "            average_loss = 0\n",
    "\n",
    "    # The cells below display and pickle `final_embeddings`; it has to be\n",
    "    # evaluated here, while the session is still open, or a fresh\n",
    "    # Restart & Run All stops with a NameError in the next cell.\n",
    "    # NOTE(review): assumes the graph cell defines `normalized_embeddings`\n",
    "    # (the normalized embedding tensor) - confirm the name before running.\n",
    "    final_embeddings = normalized_embeddings.eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "array([[ 0.03698784,  0.08871643, -0.0046615 , ..., -0.14359537,\n",
       "        -0.09679344,  0.03453588],\n",
       "       [ 0.00217088, -0.00746337, -0.02105754, ...,  0.04860028,\n",
       "        -0.05745085, -0.11272517],\n",
       "       [-0.01719761, -0.01997992, -0.04020466, ..., -0.11894673,\n",
       "        -0.08256418, -0.04724454],\n",
       "       ...,\n",
       "       [-0.10864564,  0.14505424, -0.04585417, ...,  0.09261709,\n",
       "         0.11069414,  0.10313071],\n",
       "       [ 0.07042108,  0.13556236, -0.0210914 , ...,  0.08755381,\n",
       "         0.08562793,  0.12323809],\n",
       "       [ 0.15020965,  0.06738573, -0.09012128, ...,  0.1060191 ,\n",
       "        -0.02045754, -0.13858506]], dtype=float32)"
      ]
     },
     "execution_count": 26,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the `final_embeddings` array as this cell's output (NumPy elides the\n",
    "# middle of large arrays with `...`).\n",
    "final_embeddings"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand `final_embeddings` and `dictionary` to the project pickle helper,\n",
    "# one target file each.\n",
    "for pickle_path, payload in (\n",
    "    (\"data/husein_emotion/tf_embeddings/tf_cbow_embeddings.p\", final_embeddings),\n",
    "    (\"data/husein_emotion/tf_embeddings/tf_cbow_dictionary.p\", dictionary),\n",
    "):\n",
    "    ph.convert_to_pickle(directory=pickle_path, item=payload)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
